<!doctype html>
<html lang="en">
<head>
<!-- The charset must be declared before the scripts load: the texts (and the
     d3 source itself) contain non-ASCII characters. -->
<meta charset="utf-8">
<title>Naive Bayes confusion matrix</title>
<script src="d3.v3.min.js"></script>
<script src="xregexp-all-min.js"></script>
<link rel="stylesheet" href="css/opensans.css">
<style>
body { font-family: 'Open Sans', sans-serif; }
/* Row/column highlight bars; invisible until the mouse is over the plot. */
line.guide { stroke: #eeeeee; opacity: 0.0; }
/* Transparent overlay rectangle that captures mouse events for the plot. */
rect { fill: none; pointer-events: all; }
#modal { visibility: hidden; position: absolute; left: 0px; top: 0px; background: rgba(200,200,200,0.3);
	 min-height: 100%; width: 100%; text-align: center; z-index: 1000; }
/* Lengths carry explicit px units: unitless lengths are only honoured in quirks mode. */
#docs { width: 80%; background-color: #ffffff; margin: 20px 100px 100px 100px; padding: 10px;}
#close { float: right; font-size: x-large; width: 30px; height: 30px; background-color: #ccc; cursor: pointer;}
#labels { font-size: large; font-weight: bold; }
#words { font-size: medium; margin: 10px 60px 10px 60px; }
.doc { font-size: medium; margin: 10px 50px 10px 50px; text-align: justify; font-family: "Old Standard TT", Times;}
td, th { padding: 1px 4px;}
</style>
</head>
<body>
	<!-- Overlay shown when a confusion-matrix cell is clicked and hidden again by #close.
	     The script appends one div.doc per matching document inside #docs. -->
	<div id="modal"><div id="docs">
		<div id="close">&times;</div>
		<!-- Filled with the "Original: ... | NB: ..." heading for the clicked cell. -->
		<div id="labels"></div>
		<!-- Filled by showWords() with one coloured span per distinctive word. -->
    	<div id="words"></div>
	</div></div>
	
	<!-- NOTE(review): #doc, #word and #log are not referenced by the script in this file. -->
	<div id="doc"></div>
	<div id="word"></div>
	<div id="log"></div>
	
	<!-- plotConfusion() appends the confusion-matrix SVG here. -->
        <div id="confusion"></div>
	<!-- tabulate() appends the per-label statistics table here. -->
	<div id="stats"></div>
	<!-- Hidden by plotConfusion() once the plot is about to be drawn. -->
        <div id="loading"><h3>Reading and analyzing texts. Please wait.</h3></div>
<script>

// ---- Label and vocabulary indexes ------------------------------------------

// Label strings in order of first appearance (array index == label ID),
// plus the reverse lookup from label string to label ID.
var labels = [],
    labelIDs = d3.map();

// Word strings in order of first appearance (array index == word ID),
// plus the reverse lookup from word string to word ID.
var words = [],
    wordIDs = d3.map();

// ---- Count tables ----------------------------------------------------------

var wordLabelCounts = {},     // wordID  -> { labelID: token count }
    labelWordCounts = {},     // labelID -> { wordID: token count }, filled by leaveOneOut()
    wordLabelNonZeros = {},   // wordID  -> [ labelIDs the word has been counted under ]
    labelSums = {};           // labelID -> total number of counted tokens

// Additive smoothing constant used by the classifier, and its logarithm.
var smoothing = 0.01,
    logSmoothing = Math.log(smoothing);

// Parsed documents, as returned by parseLine().
var documents = [];

// Words to ignore while tokenizing; a truthy entry marks a stopword.
var stopwords = {};

// PMB -- tweaked color scale.
// Maps a log count ratio onto a red (negative) ... gray (zero) ... blue (positive) ramp.
var colorScale = d3.scale.linear()
	.domain([-5, -3, -1, 0, 1, 3, 5])
	.range(['darkred', 'red', 'orange', 'darkgray', 'skyblue', 'royalblue', 'midnightblue']);

// wordID -> color string for the label pair currently shown; rebuilt by showWords().
var wordColors = {};

// Confusion matrix: confusion[original label ID][predicted label ID] = document count.
var confusion;

/* PMB: Set to true if the document IDs in the input have the specific format
 *      collection_title_author and if the document texts contain multibyte
 *      characters (e.g., Chinese ideographic writing systems). */
var isKT = false;

// A token starts and ends with a letter and may contain letters or punctuation in between.
var wordPattern = XRegExp("\\p{L}[\\p{L}\\p{P}]*\\p{L}", "g");

// Clicking the close box hides the document overlay again.
d3.select("#close").on("click", function () {
	d3.select("#modal").style("visibility", "hidden");
});

/**
 * Draw the confusion matrix into #confusion and wire up its interaction.
 *
 * Reads the globals `labels`, `confusion` and `documents`. Every cell
 * confusion[row][col] becomes a circle whose area grows with the count; cells
 * on the diagonal are blue, all others red. Moving the mouse highlights the
 * nearest row and column; clicking a cell opens the #modal overlay listing the
 * documents whose original label is the row and whose predicted label is the
 * column.
 */
function plotConfusion() {
	// Square plot area inside the SVG canvas.
	var left = 50;
	var right = 600;
	var top = 50;
	var bottom = 600;

	var w = 800,
	    h = 800;

	d3.select("#loading").style("visibility", "hidden");

	var vis = d3.select("#confusion")
		.append("svg:svg")
		.attr("width", w)
		.attr("height", h);

	var maxCell = d3.max(confusion, function (row) { return d3.max(row); });

	var labelScale = d3.scale.ordinal().domain(d3.range(labels.length)).rangePoints([left, right]);
	var radiusScale = d3.scale.sqrt().domain([0, maxCell]).range([0, 2 * right / labels.length]);

	// Pixel position of every label. The same scale is used for both axes
	// because the plot area is square (left == top, right == bottom).
	var labelPositions = labels.map(function (label, labelID) { return labelScale(labelID); });

	// Map a pixel coordinate back to the ID of the closest label, or -1 when
	// there are no labels. Searching the real positions keeps the result in
	// range at the plot edges and stays correct for a single label.
	function nearestLabelID(coordinate) {
		var bestID = -1;
		var bestDistance = Infinity;
		labelPositions.forEach(function (position, labelID) {
			var distance = Math.abs(position - coordinate);
			if (distance < bestDistance) {
				bestDistance = distance;
				bestID = labelID;
			}
		});
		return bestID;
	}

	// Escape text taken from the input file before it is placed into markup.
	function escapeHTML(value) {
		return String(value)
			.replace(/&/g, "&amp;")
			.replace(/</g, "&lt;")
			.replace(/>/g, "&gt;")
			.replace(/"/g, "&quot;");
	}

	labels.forEach(function (label, labelID) {
		var position = labelScale(labelID);

		// Column label, rotated to read downwards below the plot.
		vis.append("text")
			.attr("class", "ver")
			.attr("x", position)
			.attr("y", bottom + 10)
			.attr("transform", "rotate(90," + position + "," + (bottom + 10) + ")")
			.style("dominant-baseline", "middle")
			.text(label);
		// Row label to the right of the plot.
		vis.append("text")
			.attr("class", "hor")
			.attr("x", right + 10)
			.attr("y", position)
			.style("dominant-baseline", "middle")
			.text(label);

		// Highlight bars; the stylesheet keeps them transparent until hovered.
		vis.append("line")
			.attr("class", "guide")
			.attr("id", "vert" + labelID)
			.attr("x1", position).attr("x2", position)
			.attr("y1", top - 10).attr("y2", bottom + 10)
			.style("stroke-width", 10);
		vis.append("line")
			.attr("class", "guide")
			.attr("id", "hor" + labelID)
			.attr("y1", position).attr("y2", position)
			.attr("x1", left - 10).attr("x2", right + 10)
			.style("stroke-width", 10);
	});

	confusion.forEach(function (cells, row) {
		cells.forEach(function (count, col) {
			var onDiagonal = row == col;
			vis.append("circle")
				.attr("cx", labelScale(col))
				.attr("cy", labelScale(row))
				.attr("r", radiusScale(count))
				.style("fill", onDiagonal ? "#88f" : "#f88")
				.style("opacity", onDiagonal ? 0.3 : 0.7);
		});
	});

	// Transparent rectangle on top of everything that receives the mouse events.
	vis.append("rect")
		.attr("width", w)
		.attr("height", h)
		.on("mousemove", function () {
			vis.selectAll("line.guide").style("opacity", 0.0);
			var mousePoint = d3.mouse(this);
			if (mousePoint[0] >= left && mousePoint[0] <= right) {
				d3.select("#vert" + nearestLabelID(mousePoint[0])).style("opacity", 1.0);
			}
			if (mousePoint[1] >= top && mousePoint[1] <= bottom) {
				d3.select("#hor" + nearestLabelID(mousePoint[1])).style("opacity", 1.0);
			}
		})
		.on("click", function () {
			vis.selectAll("line.guide").style("opacity", 0.0);
			var mousePoint = d3.mouse(this);
			var insidePlot = mousePoint[0] >= left && mousePoint[0] <= right &&
				mousePoint[1] >= top && mousePoint[1] <= bottom;
			if (!insidePlot) { return; }

			// Column = label chosen by the classifier, row = original label.
			var maxLabelID = nearestLabelID(mousePoint[0]);
			var labelID = nearestLabelID(mousePoint[1]);
			if (maxLabelID < 0 || labelID < 0) { return; }

			// Build the heading from text nodes so label text is never parsed as markup.
			var heading = d3.select("#labels");
			heading.html("");
			heading.append("span").style("color", "red").text("Original: " + labels[labelID]);
			heading.append("span").text("  |  ");
			heading.append("span").style("color", "blue").text("NB: " + labels[maxLabelID]);

			// showWords() also rebuilds wordColors, which colorizeText() reads below.
			showWords(labelID, maxLabelID);

			var matchingDocs = documents.filter(function (doc) {
				return doc.maxLabelID == maxLabelID && doc.labelID == labelID;
			});
			var docDivs = d3.select("#docs").selectAll("div.doc").data(matchingDocs);
			docDivs.enter().append("div").attr("class", "doc");
			docDivs.exit().remove();
			docDivs.html(function (doc) {
				var metaString = doc['id'];
				if (isKT) {
					// IDs look like collection_title_author; show "collection (author) title".
					var docMeta = doc['id'].split("_");
					metaString = docMeta[0] + " (" + docMeta[2] + ") " + docMeta[1];
				}
				return '<p><b>' + escapeHTML(metaString) + ':</b>  ' + colorizeText(doc) + '</p>';
			});
			d3.select("#modal").style("visibility", "visible");
		});

}

// PMB
/**
 * Return the document's original text with every word that currently has an
 * entry in `wordColors` wrapped in a coloured span.
 *
 * All coloured words of the document are combined into one regular expression
 * and applied in a single pass. This way each occurrence is wrapped exactly
 * once and the inserted markup itself is never searched, so words such as
 * "span" or "color" cannot corrupt it. Matching is case-insensitive; unless
 * `isKT` is set, matches must sit on \b word boundaries.
 *
 * NOTE(review): the text itself is returned unescaped, exactly as before, so
 * markup present in the input file is still interpreted by the browser.
 *
 * @param doc  document object with `originalText` and `tokens` (word IDs)
 * @returns    HTML string
 */
function colorizeText(doc) {
	var docText = doc.originalText;

	// Unique coloured words of this document. Keys are prefixed so that no
	// word can collide with a built-in object property name.
	var colorByWord = {};
	var alternatives = [];
	doc.tokens.forEach(function (wordID) {
		if (!wordColors.hasOwnProperty(wordID)) { return; }
		var wordText = words[wordID];
		var key = "w:" + wordText;
		if (colorByWord.hasOwnProperty(key)) { return; }
		colorByWord[key] = wordColors[wordID];
		// Tokens may contain punctuation, so escape regex metacharacters.
		alternatives.push(wordText.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
	});

	if (alternatives.length === 0) { return docText; }

	// Longest first, so a word is not pre-empted by a shorter word that is its prefix.
	alternatives.sort(function (a, b) { return b.length - a.length; });

	var group = "(" + alternatives.join("|") + ")";
	var regEx = new RegExp(isKT ? group : "\\b" + group + "\\b", "ig");

	return docText.replace(regEx, function (match) {
		var key = "w:" + match.toLowerCase();
		if (!colorByWord.hasOwnProperty(key)) { return match; }
		return "<span style='color:" + colorByWord[key] + "'>" + match + "</span>";
	});
}

/**
 * Log-likelihood ratio statistic (G2) for comparing two proportions:
 * x1 occurrences out of n1 versus x2 occurrences out of n2.
 *
 * Each of the four cells contributes count * log(pooled / observed). A cell
 * whose count is exactly 0 contributes 0 (the limit of x * log x as x -> 0)
 * instead of the NaN that 0 * log(.../0) evaluates to, so the result stays
 * finite when a word is absent from, or makes up all of, one sample.
 *
 * @returns the G2 value; 0 when both proportions are equal
 */
function binaryG2 (x1, x2, n1, n2) {
	var p1 = x1 / n1;
	var p2 = x2 / n2;
	var p = (x1 + x2) / (n1 + n2);

	function cell(count, pooled, observed) {
		return count === 0 ? 0 : count * Math.log(pooled / observed);
	}

	return -2 * ( cell(x1, p, p1) + cell(n1 - x1, 1 - p, 1 - p1) +
				  cell(x2, p, p2) + cell(n2 - x2, 1 - p, 1 - p2) );
}

/**
 * Rebuild `wordColors` for a pair of labels and list the most distinctive
 * words in #words.
 *
 * For every word counted under at least one of the two labels:
 *  - same label on both sides: the log ratio is log(1 / count), and the word
 *    gets the fixed score 4.0 if its labelWordCounts entry exceeds 0.01% of
 *    the vocabulary size, otherwise 1.0;
 *  - different labels: the log ratio is log(rightCount / leftCount) and the
 *    score is the G2 statistic of the two counts.
 * A word missing under one label counts as 0 there; its infinite log ratio is
 * mapped to the matching end of the colour scale, so the word still gets a
 * valid colour. Only words with score > 3 whose two counts each exceed 0.01%
 * of the respective label total are listed, sorted by log ratio.
 *
 * @param leftLabel   label ID of the original label
 * @param rightLabel  label ID chosen by the classifier
 */
function showWords(leftLabel, rightLabel) {

	// Reset the word colors index
	wordColors = {};

	var relativeCounts = [];

	var leftThreshold = labelSums[leftLabel] * .0001;
	var rightThreshold = labelSums[rightLabel] * .0001;
	var wordThreshold = words.length * .0001;

	var colorDomain = colorScale.domain();
	var minRatio = colorDomain[0];
	var maxRatio = colorDomain[colorDomain.length - 1];

	// Infinite ratios (word absent under one label) use the end-of-scale colour.
	function colorFor(logRatio) {
		if (logRatio === Infinity) { return colorScale(maxRatio); }
		if (logRatio === -Infinity) { return colorScale(minRatio); }
		return colorScale(logRatio);
	}

	words.forEach(function (word, wordID) {
		var counts = wordLabelCounts[wordID];
		var inLeft = counts.hasOwnProperty(leftLabel);
		var inRight = counts.hasOwnProperty(rightLabel);
		if (!inLeft && !inRight) { return; }

		var leftCount = inLeft ? counts[leftLabel] : 0;
		var rightCount = inRight ? counts[rightLabel] : 0;
		var logRatio, g2;

		// PMB
		if (leftLabel == rightLabel) {
			var perLabel = labelWordCounts[leftLabel] || {};
			logRatio = Math.log(1 / leftCount);
			g2 = perLabel[wordID] > wordThreshold ? 4.0 : 1.0;
		} else {
			g2 = binaryG2(leftCount, rightCount, labelSums[leftLabel], labelSums[rightLabel]);
			logRatio = Math.log(rightCount / leftCount);
		}

		wordColors[wordID] = colorFor(logRatio);
		relativeCounts.push({wordID: wordID, leftCount: leftCount, rightCount: rightCount, logRatio: logRatio, g2: g2 });
	});

	var filteredCounts = relativeCounts.filter(function (c) {
		return c.g2 > 3 && c.leftCount > leftThreshold && c.rightCount > rightThreshold;
	});
	filteredCounts.sort(function (a, b) { return a.logRatio - b.logRatio; });

	var wordDivs = d3.select("#words").selectAll("span").data(filteredCounts);
	wordDivs.enter().append("span");
	wordDivs.exit().remove();
	wordDivs
		.style("color", function (w) { return wordColors[w.wordID]; })
		.text(function (w) { return words[w.wordID] + " "; });

}

// Return the numeric ID of a label string. A label seen for the first time is
// appended to `labels`, registered in `labelIDs`, and its token total in
// `labelSums` starts at zero.
function getLabelID(label) {
	if (labelIDs.has(label)) {
		return labelIDs.get(label);
	}

	var freshID = labels.length;
	labels.push(label);
	labelIDs.set(label, freshID);
	labelSums[freshID] = 0;
	return freshID;
}

// Return the numeric ID of a word. A word seen for the first time is appended
// to `words`, registered in `wordIDs`, and given an empty per-label count
// table and an empty list of labels it occurs with.
function getWordID(word) {
	if (wordIDs.has(word)) {
		return wordIDs.get(word);
	}

	var freshID = words.length;
	words.push(word);
	wordIDs.set(word, freshID);
	wordLabelCounts[freshID] = {};
	wordLabelNonZeros[freshID] = [];
	return freshID;
}

/**
 * Row callback for d3.tsv.parseRows: turn one [ID, TAG, TEXT] row into a
 * document object and add its tokens to the global count tables.
 *
 * The text is lower-cased and tokenized with `wordPattern`. Tokens of two
 * characters or fewer and tokens listed in `stopwords` are skipped. Every
 * remaining token increments wordLabelCounts[wordID][labelID] and
 * labelSums[labelID].
 *
 * @param fields  the tab-separated fields of the row
 * @param row     index of the row in the input
 * @returns       { originalOrder, id, labelID, originalText, tokens }, or
 *                undefined when the row does not have exactly three fields or
 *                its text contains no match for `wordPattern`
 */
function parseLine( fields, row ) {
	// If it's not in [ID]\t[TAG]\t[TEXT] format...
	if (fields.length != 3) { /* error! */ return; }

	var docID = fields[0];
	var labelID = getLabelID(fields[1]); // map the tag string to its numeric label ID
	// The input encodes line breaks as a literal backslash followed by "n".
	var text = fields[2].replace(/\\n/g, " ");

	var rawTokens = text.toLowerCase().match(wordPattern);
	if (rawTokens == null) { return; }

	// Own-property test only: a plain lookup would also find inherited
	// members, turning words like "constructor" into stopwords.
	function isStopword(word) {
		return Object.prototype.hasOwnProperty.call(stopwords, word) && !!stopwords[word];
	}

	var tokens = [];
	rawTokens.forEach(function (word) {
		if (word.length <= 2 || isStopword(word)) { return; }

		var wordID = getWordID(word);
		var counts = wordLabelCounts[wordID];
		if (! counts.hasOwnProperty(labelID)) {
			// First time this word is seen under this label.
			counts[labelID] = 0;
			wordLabelNonZeros[wordID].push(labelID);
		}
		counts[labelID] += 1;
		labelSums[labelID] += 1;
		tokens.push(wordID);
	});

	return { "originalOrder" : row, "id" : docID, "labelID" : labelID, "originalText" : text, "tokens" : tokens };
}

/**
 * Classify one document with the counts of all OTHER documents (leave-one-out)
 * and record the outcome.
 *
 * The document's tokens are first removed from the counts of its own label.
 * Each label then gets the score
 *     sum over tokens of log((count + smoothing) / (labelSum + vocabulary * smoothing))
 * divided by the number of tokens. The best label is stored in doc.maxLabelID,
 * all scores in doc.labelWeights, and confusion[doc.labelID][doc.maxLabelID]
 * is incremented. Finally the tokens are added back and also tallied in
 * labelWordCounts[labelID][wordID].
 *
 * A document without tokens has no defined scores: doc.maxLabelID stays -1
 * and the confusion matrix is left untouched.
 */
function leaveOneOut( doc ) {
	var ownLabel = doc.labelID;
	var tokenCount = doc.tokens.length;

	// First subtract the document from the label's histogram
	doc.tokens.forEach(function (wordID) {
		wordLabelCounts[wordID][ownLabel] -= 1;
	});
	labelSums[ownLabel] -= tokenCount;

	// Start every label at the score of a document made only of unseen words...
	doc.labelWeights = Array(labels.length);
	labels.forEach(function (label, labelID) {
		doc.labelWeights[labelID] = tokenCount * (logSmoothing - Math.log(labelSums[labelID] + words.length * smoothing));
	});

	// ...then, for each token, swap the unseen-word term for the real one in
	// every label the word has been counted under.
	for (var position = 0; position < tokenCount; position++) {
		var wordID = doc.tokens[position];
		var nonZeros = wordLabelNonZeros[wordID];
		var counts = wordLabelCounts[wordID];
		for (var i = 0; i < nonZeros.length; i++) {
			var labelID = nonZeros[i];
			doc.labelWeights[labelID] +=
				Math.log(counts[labelID] + smoothing) - logSmoothing;
		}
	}

	var maxScore = Number.NEGATIVE_INFINITY;
	doc.maxLabelID = -1;

	labels.forEach(function (label, labelID) {
		doc.labelWeights[labelID] /= tokenCount;
		if (doc.labelWeights[labelID] > maxScore) {
			maxScore = doc.labelWeights[labelID];
			doc.maxLabelID = labelID;
		}
	});

	// Skip documents that could not be scored (no tokens -> NaN scores).
	if (doc.maxLabelID !== -1) {
		confusion[ownLabel][doc.maxLabelID] += 1;
	}

	// Add the document back in
	doc.tokens.forEach(function (wordID) {
		wordLabelCounts[wordID][ownLabel] += 1;

		// Key by the word's ID (an object literal {wordID: 1} would create
		// the literal key "wordID" instead).
		var perLabel = labelWordCounts[ownLabel];
		if (!perLabel) {
			perLabel = labelWordCounts[ownLabel] = {};
		}
		perLabel[wordID] = (perLabel[wordID] || 0) + 1;
	});
	labelSums[ownLabel] += tokenCount;
}

/**
 * Append a table to #stats with one header cell per column name and one body
 * row per data object.
 *
 * Cell values are written as text rather than markup, so values that come
 * from the input file (such as label names) are displayed literally.
 *
 * @param data     array of row objects
 * @param columns  array of property names; also used as the header texts
 * @returns        the d3 selection of the new table
 */
function tabulate(data, columns) {
  var statsTable = d3.select("#stats").append("table"),
      thead = statsTable.append("thead"),
      tbody = statsTable.append("tbody");

  thead.append("tr")
       .selectAll("th")
       .data(columns)
       .enter()
       .append("th")
         .text(function(column) { return column; });

  var rows = tbody.selectAll("tr")
      .data(data)
      .enter()
      .append("tr");

  rows.selectAll("td")
      .data(function(row) {
        // One {column, value} pair per requested column, in column order.
        return columns.map(function(column) {
          return {column: column, value: row[column]};
        });
      })
      .enter()
      .append("td")
      .text(function(d) { return d.value; });

  return statsTable;

}

/**
 * Compute per-label precision, recall and F-score from the classified
 * documents and render them as a table via tabulate().
 *
 * For each label: precision = matches / documents the classifier assigned to
 * it, recall = matches / documents originally carrying it, and
 * F-score = 2 * matches / (assigned + original), which is the harmonic mean
 * of the unrounded precision and recall. A ratio with a zero denominator is
 * reported as 0 instead of NaN. All ratios are rounded to four decimals.
 */
function showStats() {

  var tableData = [];

  var labelHumanStories = {};
  var labelNBstories = {};
  var labelMatches = {};

  labels.forEach(function (label, labelID) {
    labelHumanStories[labelID] = 0;
    labelNBstories[labelID] = 0;
    labelMatches[labelID] = 0;
  });

  documents.forEach(function (doc) {

    labelHumanStories[doc.labelID] += 1;

    // maxLabelID is -1 for documents that could not be scored; do not
    // create a counter for it.
    if (labelNBstories.hasOwnProperty(doc.maxLabelID)) {
      labelNBstories[doc.maxLabelID] += 1;
    }

    if (doc.labelID == doc.maxLabelID) {
      labelMatches[doc.labelID] += 1;
    }

  });

  // Rounded quotient, 0 when the denominator is 0.
  function ratio(numerator, denominator) {
    return denominator > 0 ? parseFloat((numerator / denominator).toFixed(4)) : 0;
  }

  var statsColumns = ['Original label', 'Stories w/orig. label', 'Stories w/label from NB', 'Matches', 'Precision', 'Recall', 'F-score'];

  labels.forEach(function (label, labelID) {

    var matches = labelMatches[labelID];
    var assigned = labelNBstories[labelID];
    var original = labelHumanStories[labelID];

    tableData.push({
      'Original label': label,
      'Stories w/orig. label': original,
      'Stories w/label from NB': assigned,
      'Matches': matches,
      'Precision': ratio(matches, assigned),
      'Recall': ratio(matches, original),
      'F-score': ratio(2 * matches, assigned + original)
    });

  });

  tabulate(tableData, statsColumns);

}

// Entry point: load the tab-separated input, build the count tables, classify
// every document leave-one-out, then draw the confusion matrix and the
// statistics table.
d3.text("input.txt",
function (error, text) {
	// Without this check a failed request would crash inside the parser and
	// leave the "Please wait" message on screen forever.
	if (error || text == null) {
		d3.select("#loading").html("<h3>Could not load input.txt.</h3>");
		return;
	}

	// parseLine() fills labels, words and the count tables as a side effect;
	// rows for which it returns nothing are skipped by parseRows.
	documents = d3.tsv.parseRows(text, parseLine);

	// Square matrix of zeros: confusion[original label][predicted label].
	confusion = Array(labels.length);
	for (var row = 0; row < labels.length; row++) {
		confusion[row] = Array(labels.length);
		for (var col = 0; col < labels.length; col++) {
			confusion[row][col] = 0;
		}
	}

	documents.forEach(function (doc) { leaveOneOut(doc); });

	plotConfusion();

	showStats();
});

</script>
</body>
</html>
